import pandas as pd
import numpy as np
import warnings
import re
import nltk
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, roc_curve,roc_auc_score, \
classification_report, confusion_matrix, \
precision_recall_curve, precision_score, \
f1_score, fbeta_score
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
import scikitplot as skplt
import shap
import plotly.express as px
from imblearn.over_sampling import SMOTE
from sklearn.inspection import PartialDependenceDisplay
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)
def get_models_score(classifiers, X_train, Y_train, X_test=None, Y_test=None):
    """Fit each classifier on the training split and report its test accuracy.

    Parameters
    ----------
    classifiers : iterable of estimators exposing fit/predict/score.
    X_train, Y_train : training features and labels.
    X_test, Y_test : optional test split. When omitted, falls back to the
        module-level ``X_test`` / ``Y_test`` globals, which is what the
        original implementation implicitly relied on.

    Returns
    -------
    list of float
        Test accuracy of each classifier, in input order. (Previously the
        function returned ``None``; callers that ignore the return value
        are unaffected.)
    """
    # Backward-compat fallback: the original read the test split from
    # module scope instead of taking it as parameters.
    if X_test is None:
        X_test = globals()['X_test']
    if Y_test is None:
        Y_test = globals()['Y_test']
    accuracies = []
    for classifier in classifiers:
        classifier.fit(X_train, Y_train)
        y_pred = classifier.predict(X_test)
        # Fraction of correct predictions — numerically identical to
        # sklearn's metrics.accuracy_score, but dependency-free.
        accuracy = float(np.mean(np.asarray(y_pred) == np.asarray(Y_test)))
        accuracies.append(accuracy)
        print(classifier)
        print("model score: %.3f" % classifier.score(X_test, Y_test))
        print("Accuracy:", accuracy)
    return accuracies
def get_model_report(Y_test, y_pred):
    """Print sklearn's per-class precision/recall/F1 classification report."""
    report = classification_report(Y_test, y_pred)
    print("Classification Report")
    print(report)
def get_confusion_matrix(model, X_test, Y_test):
    """Plot and print the raw and row-normalized confusion matrices.

    Parameters
    ----------
    model : a fitted classifier.
    X_test, Y_test : test features and true labels.

    Produces two matplotlib figures (raw counts and normalized over true
    rows) and prints each matrix to stdout.
    """
    # plot_confusion_matrix was deprecated in sklearn 1.0 and removed in
    # 1.2; ConfusionMatrixDisplay.from_estimator is its direct replacement
    # and produces the same plot. Imported locally to keep the fix
    # self-contained.
    from sklearn.metrics import ConfusionMatrixDisplay
    titles_options = [("Confusion matrix, without normalization", None),
                      ("Normalized confusion matrix", 'true')]
    for title, normalize in titles_options:
        disp = ConfusionMatrixDisplay.from_estimator(model, X_test, Y_test,
                                                     cmap=plt.cm.Blues,
                                                     normalize=normalize)
        disp.ax_.set_title(title)
        print(title)
        print(disp.confusion_matrix)
    plt.show()
def transform_ratings(row):
    """Binarize a review row's star rating: 1 for more than 3 stars, else 0."""
    return 1 if row['stars_x'] > 3 else 0
def get_roc_curves(ytest, yhat, label='LGBM'):
    """Plot ROC curves and report the G-mean-optimal decision threshold.

    Draws two figures: the plain ROC curve against the no-skill diagonal,
    then the same curve with the G-mean-optimal operating point marked.

    Parameters
    ----------
    ytest : array-like of true binary labels.
    yhat : array-like of predicted positive-class scores/probabilities.
    label : legend label for the model curve. Default 'LGBM' preserves the
        previously hard-coded label.

    Returns
    -------
    tuple (best_threshold, best_gmean)
        Previously discarded; returning it lets callers reuse the threshold.
    """
    fpr, tpr, thresholds = roc_curve(ytest, yhat)
    # First figure: plain ROC curve vs. the no-skill diagonal.
    plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
    plt.plot(fpr, tpr, marker='.', label=label)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()
    # G-mean = sqrt(TPR * TNR) balances sensitivity and specificity; its
    # maximum marks the best operating point on the ROC curve.
    gmeans = np.sqrt(tpr * (1 - fpr))
    ix = np.argmax(gmeans)
    print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))
    # Second figure: same curve with the optimal point highlighted.
    plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
    plt.plot(fpr, tpr, marker='.', label=label)
    plt.scatter(fpr[ix], tpr[ix], s=100, marker='o', color='black', label='Best')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()
    return thresholds[ix], gmeans[ix]
def get_precission_recall_curve(ytest, yhat=None):
    """Plot precision-recall curves and report the F1-optimal threshold.

    Bug fix: the original body referenced ``precision``, ``recall`` and
    ``thresholds`` that were never computed inside the function (NameError
    unless identically-named globals happened to exist). Pass ``yhat`` to
    have them computed here via ``precision_recall_curve``.
    (Function name typo 'precission' kept for caller compatibility.)

    Parameters
    ----------
    ytest : array-like of true binary labels.
    yhat : optional array-like of predicted positive-class scores. When
        omitted, falls back to module-level ``precision`` / ``recall`` /
        ``thresholds`` arrays, matching the original's implicit behavior.

    Returns
    -------
    tuple (best_threshold, best_fscore)
    """
    if yhat is not None:
        precision, recall, thresholds = precision_recall_curve(ytest, yhat)
    else:
        # Backward-compat fallback to the globals the original relied on.
        precision = globals()['precision']
        recall = globals()['recall']
        thresholds = globals()['thresholds']
    ytest = np.asarray(ytest)
    # No-skill baseline for PR curves is the positive-class prevalence.
    no_skill = len(ytest[ytest == 1]) / len(ytest)
    # First figure: plain PR curve vs. the no-skill baseline.
    plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
    plt.plot(recall, precision, marker='.', label='LGBM')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.show()
    # Per-point F1; guard the 0/0 case where precision == recall == 0.
    denom = precision + recall
    fscore = np.divide(2 * precision * recall, denom,
                       out=np.zeros_like(denom, dtype=float),
                       where=denom > 0)
    # precision/recall have one more entry than thresholds; restrict the
    # argmax to indices that have a corresponding threshold.
    ix = int(np.argmax(fscore[:len(thresholds)]))
    print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))
    # Second figure: same curve with the optimal point highlighted.
    plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
    plt.plot(recall, precision, marker='.', label='LGBM')
    plt.scatter(recall[ix], precision[ix], s=100, marker='o', color='black', label='Best')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.show()
    return thresholds[ix], fscore[ix]
def get_pie_chart(df):
    """Render a Plotly donut chart of the star-rating distribution.

    Parameters
    ----------
    df : DataFrame with an 'index' column (star value) and a 'percent'
        column (its share of all reviews).
    """
    # Green-to-red palette, one color per star bucket.
    star_colors = ['#2AA10F', '#92E000', '#E1FF00', '#F58B00', '#DE3700']
    fig = px.pie(
        df,
        names='index',
        height=400,
        width=600,
        hole=0.4,
        title='Nightlife Businesses Stars Distribution',
        values='percent',
        color_discrete_sequence=star_colors,
    )
    # Horizontal legend, centered below the chart.
    fig.update_layout(legend=dict(orientation='h', yanchor='bottom', y=-0.2,
                                  xanchor='center', x=0.5))
    fig.show()
# --- Notebook configuration and data loading ---
nightlife_businesses_path = '../data/Processed/df_nightlife_PA_reviews.csv'
seed = 123456  # fixed random seed for reproducible splits/resampling
beta = 2  # presumably the beta for fbeta_score (favors recall) — TODO confirm
ind = 6  # NOTE(review): purpose unclear from this chunk — verify where it is used
# Suppress library warnings for cleaner notebook output.
warnings.filterwarnings('ignore')
# Load the preprocessed Pennsylvania nightlife-review dataset.
nightlife_businesses_PA = pd.read_csv(nightlife_businesses_path)
nightlife_businesses_PA.head()
| review_id | business_id | stars_x | useful | funny | cool | date | keywords | BusinessAcceptsCreditCards | RestaurantsDelivery | OutdoorSeating | BikeParking | RestaurantsPriceRange2 | RestaurantsTakeOut | ByAppointmentOnly | WiFi | Alcohol | Caters | WheelchairAccessible | GoodForKids | RestaurantsAttire | RestaurantsReservations | CoatCheck | DogsAllowed | RestaurantsTableService | RestaurantsGoodForGroups | HasTV | HappyHour | DriveThru | NoiseLevel | BusinessAcceptsBitcoin | AcceptsInsurance | Smoking | GoodForDancing | BYOB | Corkage | BYOBCorkage | HairSpecializesIn | Open24Hours | RestaurantsCounterService | AgesAllowed | DietaryRestrictions | Monday | Tuesday | Wednesday | Thursday | Friday | Saturday | Sunday | name | postal_code | latitude | longitude | stars_y | review_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | oyaMhzBSwfGgemSGuZCdwQ | YtSqYv1Q_pOltsVPSx54SA | 5.0 | 0.0 | 0.0 | 0.0 | 2013-06-24 11:21:25 | ['Tremendous', 'service', 'Big', 'shout', 'Dou... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 1 | iOQ_bnKI5HfPbH43DMAw6w | YtSqYv1Q_pOltsVPSx54SA | 3.0 | 0.0 | 0.0 | 0.0 | 2013-01-27 19:22:26 | ['good', 'place', 'lofty', 'prices', 'proporti... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 2 | rzrBiijeQh7ubjfRCr-UtA | YtSqYv1Q_pOltsVPSx54SA | 4.0 | 12.0 | 11.0 | 11.0 | 2008-04-30 15:26:12 | ['bar', 'area', 'upscale', 'cities', 'restaura... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 3 | 1HP3yZN3jT646IlHSo7GZw | YtSqYv1Q_pOltsVPSx54SA | 5.0 | 0.0 | 0.0 | 0.0 | 2014-06-11 16:10:04 | ['prime', 'rib', 'steak', 'joints', 'experience'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 4 | Vv6acqoztdtzTD8Gq0gifA | YtSqYv1Q_pOltsVPSx54SA | 5.0 | 0.0 | 0.0 | 0.0 | 2018-03-04 00:43:27 | ['name', 'Best', 'Prime', 'Rib', 'town', 'Serv... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
We have imported the nightlife businesses PA dataset, which contains all the data related to nightlife businesses in the state of Pennsylvania.
According to the use case that we have proposed, which is, predicting the rating of a review based on the comments of the review itself, we have decided to extract the most important keywords from the text to eliminate the noise it may have. In this way, we believe that we will facilitate the work to the model in order to predict the rating.
We believe that predicting the rating of a business would be counterproductive, since this rating is calculated as the average of all the reviews of the business. Therefore, it would not make sense to use the rating of the business as the target variable, but rather the rating of the review, which is a more realistic goal. Accordingly, the model's target variable will be stars_x, which holds the rating of each individual review.
print("Total No. of Reviews: {}".format(nightlife_businesses_PA.shape[0]))
Total No. of Reviews: 261678
We have a total of 261678 reviews.
# Percentage share of each star rating (normalize=True -> fractions, *100 -> %).
nightlife_businesses_PA_stars = nightlife_businesses_PA["stars_x"].value_counts(normalize=True).mul(100).rename('percent').reset_index()
# Absolute count of each star rating.
# NOTE(review): value_counts().reset_index() yielding an 'index' column is the
# pandas < 2.0 behavior; newer pandas names the columns differently — verify.
nightlife_businesses_PA_count = nightlife_businesses_PA["stars_x"].value_counts().reset_index()
# Merge percentages and counts on the star value ('index' column).
nightlife_businesses_PA_count_pct = pd.merge(nightlife_businesses_PA_stars, nightlife_businesses_PA_count, on=['index'], how='inner')
nightlife_businesses_PA_count_pct
| index | percent | stars_x | |
|---|---|---|---|
| 0 | 5.0 | 37.475065 | 98064 |
| 1 | 4.0 | 28.313806 | 74091 |
| 2 | 3.0 | 13.955701 | 36519 |
| 3 | 1.0 | 10.879019 | 28468 |
| 4 | 2.0 | 9.376409 | 24536 |
# Donut chart of the star-rating distribution.
get_pie_chart(nightlife_businesses_PA_count_pct)
# Raw counts per star rating for reference.
nightlife_businesses_PA["stars_x"].value_counts()
5.0 98064 4.0 74091 3.0 36519 1.0 28468 2.0 24536 Name: stars_x, dtype: int64
We have a clear predominance of reviews with a rating between 4 and 5, which would represent a good comment from the user.
To facilitate the work of the model, we are going to convert this variable to an integer type since the model only accepts values of this type.
# Cast the target ratings from float to int (the classifiers expect integer
# class labels); round first to be safe against non-integral values.
nightlife_businesses_PA['stars_x'] = nightlife_businesses_PA['stars_x'].round().astype(int)
nightlife_businesses_PA.head(50)
| review_id | business_id | stars_x | useful | funny | cool | date | keywords | BusinessAcceptsCreditCards | RestaurantsDelivery | OutdoorSeating | BikeParking | RestaurantsPriceRange2 | RestaurantsTakeOut | ByAppointmentOnly | WiFi | Alcohol | Caters | WheelchairAccessible | GoodForKids | RestaurantsAttire | RestaurantsReservations | CoatCheck | DogsAllowed | RestaurantsTableService | RestaurantsGoodForGroups | HasTV | HappyHour | DriveThru | NoiseLevel | BusinessAcceptsBitcoin | AcceptsInsurance | Smoking | GoodForDancing | BYOB | Corkage | BYOBCorkage | HairSpecializesIn | Open24Hours | RestaurantsCounterService | AgesAllowed | DietaryRestrictions | Monday | Tuesday | Wednesday | Thursday | Friday | Saturday | Sunday | name | postal_code | latitude | longitude | stars_y | review_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | oyaMhzBSwfGgemSGuZCdwQ | YtSqYv1Q_pOltsVPSx54SA | 5 | 0.0 | 0.0 | 0.0 | 2013-06-24 11:21:25 | ['Tremendous', 'service', 'Big', 'shout', 'Dou... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 1 | iOQ_bnKI5HfPbH43DMAw6w | YtSqYv1Q_pOltsVPSx54SA | 3 | 0.0 | 0.0 | 0.0 | 2013-01-27 19:22:26 | ['good', 'place', 'lofty', 'prices', 'proporti... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 2 | rzrBiijeQh7ubjfRCr-UtA | YtSqYv1Q_pOltsVPSx54SA | 4 | 12.0 | 11.0 | 11.0 | 2008-04-30 15:26:12 | ['bar', 'area', 'upscale', 'cities', 'restaura... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 3 | 1HP3yZN3jT646IlHSo7GZw | YtSqYv1Q_pOltsVPSx54SA | 5 | 0.0 | 0.0 | 0.0 | 2014-06-11 16:10:04 | ['prime', 'rib', 'steak', 'joints', 'experience'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 4 | Vv6acqoztdtzTD8Gq0gifA | YtSqYv1Q_pOltsVPSx54SA | 5 | 0.0 | 0.0 | 0.0 | 2018-03-04 00:43:27 | ['name', 'Best', 'Prime', 'Rib', 'town', 'Serv... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 5 | 8CF6b3nrnAohDawiKv87TA | YtSqYv1Q_pOltsVPSx54SA | 5 | 0.0 | 0.0 | 0.0 | 2016-02-23 19:40:10 | ['Prime', 'Rib', 'offer', 'cut'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 6 | E6qHEOzFhGiYAn5cgzbZkQ | YtSqYv1Q_pOltsVPSx54SA | 2 | 3.0 | 0.0 | 0.0 | 2010-07-07 21:29:29 | ['nothing', 'thats', 'food'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 7 | vftJTlt7KVMtCRbcBZNSwg | YtSqYv1Q_pOltsVPSx54SA | 5 | 1.0 | 0.0 | 0.0 | 2014-12-18 20:47:25 | ['time', 'cut', 'wine', 'full', 'boyfriend'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 8 | wdcjv9W7RodJofnvBzK6FQ | YtSqYv1Q_pOltsVPSx54SA | 5 | 1.0 | 0.0 | 0.0 | 2014-09-10 13:25:46 | ['friend', 'food', 'best', 'Phila', 'night'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 9 | nd29xztgxcH1cV0Srf9WAw | YtSqYv1Q_pOltsVPSx54SA | 1 | 2.0 | 3.0 | 0.0 | 2016-10-02 00:32:34 | ['place', 'negative', 'stars'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 10 | IY7i04LMohgO7HIk5pYvew | YtSqYv1Q_pOltsVPSx54SA | 5 | 2.0 | 2.0 | 2.0 | 2005-09-23 13:05:56 | ['fact', 'part', 'small'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 11 | r0Rg8B4XDsBf8KomsPijmw | YtSqYv1Q_pOltsVPSx54SA | 5 | 2.0 | 1.0 | 0.0 | 2013-01-28 02:15:10 | ['restaurant', 'week', 'place', 'years', 'times'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 12 | RWSb10J0em22lTpFhZ5QZQ | YtSqYv1Q_pOltsVPSx54SA | 3 | 0.0 | 0.0 | 0.0 | 2017-11-07 14:03:03 | ['crab', 'food', 'fine', 'cakes'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 13 | A50zoocWps2aiZzbDCk6Iw | YtSqYv1Q_pOltsVPSx54SA | 2 | 5.0 | 2.0 | 4.0 | 2009-08-07 15:17:30 | ['good', 'fancy', 'half', 'cut'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 14 | UBSnRKNmhyKdx-PTn8abUA | YtSqYv1Q_pOltsVPSx54SA | 2 | 1.0 | 0.0 | 0.0 | 2013-08-16 04:34:42 | ['way', 'food', 'date', 'place', 'server', 'me... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 15 | D599DF6m2Z6XC7R1e4xHkg | YtSqYv1Q_pOltsVPSx54SA | 5 | 1.0 | 1.0 | 0.0 | 2017-02-20 03:56:36 | ['salad', 'prime', 'rib', 'delicious'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 16 | q1bg-VKtl0YHpqwOYfbTzg | YtSqYv1Q_pOltsVPSx54SA | 4 | 1.0 | 0.0 | 0.0 | 2016-01-20 02:08:26 | ['great', 'prime', 'rib', 'month', 'chocolate'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 17 | mo3pqyAumP4BG4Nz48FiwA | YtSqYv1Q_pOltsVPSx54SA | 4 | 0.0 | 0.0 | 0.0 | 2016-06-25 12:10:26 | ['BAR', 'expensive', 'worth', 'account'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 18 | jZizwcu-gH9MYrLpXOcauw | YtSqYv1Q_pOltsVPSx54SA | 5 | 0.0 | 0.0 | 0.0 | 2017-12-30 22:53:46 | ['fantastic', 'wonderful', 'style', 'steakhous... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 19 | UG8HGpvSl8GWlZnHAUHZPg | YtSqYv1Q_pOltsVPSx54SA | 5 | 0.0 | 2.0 | 1.0 | 2014-05-28 12:21:15 | ['food', 'kids'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 20 | GmqFBtJkxbh1UjTDPnBJJQ | YtSqYv1Q_pOltsVPSx54SA | 2 | 0.0 | 0.0 | 0.0 | 2017-12-04 14:22:57 | ['nice', 'service', 'time', 'Philly', 'OK'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 21 | BqtVaxlau4qV8cqtEFDd4w | YtSqYv1Q_pOltsVPSx54SA | 3 | 2.0 | 2.0 | 2.0 | 2009-10-26 04:41:23 | ['steak', 'please', 'medium', 'rare', 'LOVE'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 22 | RvZHRve8_DHD957g2Mlpwg | YtSqYv1Q_pOltsVPSx54SA | 4 | 0.0 | 0.0 | 0.0 | 2012-04-01 20:25:52 | ['good', 'business', 'review', 'old', 'money',... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 23 | cvExhHIpO4fXilY9hGi5Kw | YtSqYv1Q_pOltsVPSx54SA | 1 | 0.0 | 0.0 | 0.0 | 2015-09-02 01:04:11 | ['general', 'manager', 'disrespectful', 'food'... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 24 | rFEvx_pu0Y0JtEZuzBzTQA | YtSqYv1Q_pOltsVPSx54SA | 4 | 0.0 | 0.0 | 0.0 | 2014-09-29 13:48:42 | ['tender', 'good', 'place', 'look'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 25 | XdP_JZI9jESTPqW8YBWf-Q | YtSqYv1Q_pOltsVPSx54SA | 5 | 0.0 | 0.0 | 0.0 | 2012-04-04 10:02:09 | ['trip', 'Prime', 'Rib', 'meal', 'selection', ... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 26 | I6nS0j-m1ifQHWq4C2mzLg | YtSqYv1Q_pOltsVPSx54SA | 5 | 4.0 | 2.0 | 3.0 | 2011-01-18 20:27:05 | ['steak', 'prime', 'rib', 'Philly', 'good'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 27 | I1IKWTGEZF1vxFNo_G8f5A | YtSqYv1Q_pOltsVPSx54SA | 5 | 0.0 | 0.0 | 0.0 | 2015-05-02 02:13:32 | ['Best', 'Prime', 'rib', 'Everything', 'Love',... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 28 | NW8H4MU1uz-xx18CZJKy7A | YtSqYv1Q_pOltsVPSx54SA | 5 | 0.0 | 0.0 | 0.0 | 2012-09-16 22:34:05 | ['nights', 'favorite', 'prime', 'rib', 'potato'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 29 | -5Ej9ImJ51-WCmX4fyvETA | YtSqYv1Q_pOltsVPSx54SA | 2 | 0.0 | 3.0 | 0.0 | 2013-05-31 12:20:34 | ['full', 'old', 'men', 'Tony', 'Soprano', 'liv... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 30 | OUWmGzDpvFbW3MdZkVeDNQ | YtSqYv1Q_pOltsVPSx54SA | 3 | 0.0 | 0.0 | 0.0 | 2013-08-05 12:54:01 | ['good', 'wearing', 'waiter', 'table', 'captai... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 31 | mQ0YpErPdsHTd2KXPBhF2Q | YtSqYv1Q_pOltsVPSx54SA | 1 | 0.0 | 0.0 | 0.0 | 2016-03-19 01:10:23 | ['mice', 'scurry', 'floor', 'server'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 32 | W0rfL71LsLlV7Jl6SEoR6Q | YtSqYv1Q_pOltsVPSx54SA | 4 | 0.0 | 0.0 | 0.0 | 2011-10-07 13:32:08 | ['food', 'Antonio'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 33 | ZdIu2b27brSWFqG_pJDLeA | YtSqYv1Q_pOltsVPSx54SA | 5 | 4.0 | 1.0 | 2.0 | 2014-04-03 15:59:19 | ['wife', 'rehearsal', 'dinner', 'wedding'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 34 | 3gFGEmRwa9cjgmcgyU8TlQ | YtSqYv1Q_pOltsVPSx54SA | 4 | 0.0 | 0.0 | 0.0 | 2010-11-27 04:26:33 | ['pleased', 'table', 'shrimp', 'salad', 'dinner'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 35 | B3bB1EVtODLW-15qBGgKFA | YtSqYv1Q_pOltsVPSx54SA | 5 | 0.0 | 0.0 | 0.0 | 2010-05-17 14:58:28 | ['great', 'family', 'Prime', 'Rib', 'meal', 'E... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 36 | -nxuCi0p3qv1s8DkX2kINg | YtSqYv1Q_pOltsVPSx54SA | 3 | 0.0 | 0.0 | 0.0 | 2016-06-01 22:28:06 | ['good', 'Prime', 'Rib', 'era', 'service', 'Sa... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 37 | X2CTpfyhB6S7WnDDpPnOdg | YtSqYv1Q_pOltsVPSx54SA | 3 | 1.0 | 0.0 | 0.0 | 2016-01-21 00:06:56 | ['service', 'bad', 'lamb', 'rack', 'good'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 38 | 9JpxZpcDPTGqIEBoz0IUBg | YtSqYv1Q_pOltsVPSx54SA | 4 | 2.0 | 1.0 | 0.0 | 2015-02-05 15:12:55 | ['place', 'service', 'Juicy', 'prime', 'rib'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 39 | p2SBD6gyw3djvrSh_Daz4Q | YtSqYv1Q_pOltsVPSx54SA | 4 | 3.0 | 1.0 | 4.0 | 2008-09-18 18:35:40 | ['Prime', 'Rib', 'Restaurant', 'Week'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 40 | QL8e1L1Vf1KDwoG2khGNGQ | YtSqYv1Q_pOltsVPSx54SA | 4 | 0.0 | 0.0 | 0.0 | 2018-04-29 12:19:46 | ['restaurant', 'place', 'Saturday', 'evening'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 41 | 5NrWXIeE_7LpF4moSpxtAA | YtSqYv1Q_pOltsVPSx54SA | 5 | 0.0 | 0.0 | 0.0 | 2011-08-26 00:03:01 | ['Warwick', 'restaurant', 'nights', 'dinner'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 42 | a90idd_P1oNf103f-nCzWg | YtSqYv1Q_pOltsVPSx54SA | 4 | 0.0 | 0.0 | 0.0 | 2012-02-05 13:16:55 | ['steak', 'good', 'see', 'quality', 'sides', '... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 43 | qCvfeguEDTt1rCHlXp1BdA | YtSqYv1Q_pOltsVPSx54SA | 5 | 3.0 | 0.0 | 0.0 | 2014-09-27 21:25:32 | ['service', 'bar', 'love', 'Prime', 'Rib', 'El... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 44 | jb_ZS62r2RO7wwFkPXTzjQ | YtSqYv1Q_pOltsVPSx54SA | 4 | 1.0 | 0.0 | 0.0 | 2010-11-28 16:50:51 | ['good', 'Make', 'bar', 'start'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 45 | KnH0ZhP3hlqoGftUwR4eaA | YtSqYv1Q_pOltsVPSx54SA | 5 | 1.0 | 0.0 | 0.0 | 2009-11-09 16:46:15 | ['Prime', 'Rib', 'sort', 'food'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 46 | ZhAKHsnlX2r_vrWFNJYZZw | YtSqYv1Q_pOltsVPSx54SA | 5 | 0.0 | 0.0 | 0.0 | 2012-05-11 22:52:33 | ['Prime', 'Rib', 'DP', 'special'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 47 | Fw-h3iTbzNC2BpWWQ6arFA | YtSqYv1Q_pOltsVPSx54SA | 4 | 0.0 | 0.0 | 1.0 | 2013-12-10 01:49:09 | ['place', 'dinner', 'room', 'eat', 'fancy', 'c... | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 48 | HVBOJN5lKvZf3LrjcxTY-A | YtSqYv1Q_pOltsVPSx54SA | 4 | 3.0 | 0.0 | 0.0 | 2016-09-19 18:47:12 | ['night', 'servers', 'Vinny', 'city'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
| 49 | sY6oP1czbvflB2WkLyIZ7g | YtSqYv1Q_pOltsVPSx54SA | 5 | 0.0 | 0.0 | 0.0 | 2018-06-11 04:49:55 | ['Love', 'feel'] | True | True | False | True | 3.0 | True | NaN | 'free' | 'full_bar' | False | True | False | u'dressy' | True | True | NaN | NaN | True | True | True | NaN | u'average' | NaN | NaN | NaN | False | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | 16:30-22:0 | Rittenhouse Grill | 19103 | 39.948949 | -75.169532 | 3.5 | 290 |
nightlife_businesses_PA["stars_x"].value_counts()
5 98064 4 74091 3 36519 1 28468 2 24536 Name: stars_x, dtype: int64
As we can see, the dataset is quite unbalanced: there is a clear difference between the two majority classes (ratings 4 and 5) and the remaining classes. We will see how this affects the model and, based on the results, consider using a sampling technique.
# NOTE(review): iterating a DataFrame yields its COLUMN NAMES, so the original
# comprehension built one identical copy of the target Series per column.
# Keep the list-of-Series shape (later cells index stars_ratings[1]) but
# extract the target column only once instead of once per column.
_target = nightlife_businesses_PA['stars_x']
stars_ratings = [_target for _ in nightlife_businesses_PA.columns]
stars_ratings
[0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64,
0 5
1 3
2 4
3 5
4 5
..
261673 2
261674 5
261675 5
261676 5
261677 5
Name: stars_x, Length: 261678, dtype: int64]
Bearing in mind that we are making a sentiment analysis model, we will pass the variable stars_x to the model as the objective variable and the keywords to train the model.
So let's save these two variables.
keywords = nightlife_businesses_PA['keywords']
keywords
0 ['Tremendous', 'service', 'Big', 'shout', 'Dou...
1 ['good', 'place', 'lofty', 'prices', 'proporti...
2 ['bar', 'area', 'upscale', 'cities', 'restaura...
3 ['prime', 'rib', 'steak', 'joints', 'experience']
4 ['name', 'Best', 'Prime', 'Rib', 'town', 'Serv...
...
261673 ['family', 'special', 'occasion', 'help', 'dis...
261674 ['open', 'spot']
261675 ['beautiful', 'birthday', 'Lark', 'staff', 'de...
261676 ['good', 'Lark', 'anyone', 'get']
261677 ['food', 'perfect', 'rare', 'restaurant', 'goo...
Name: keywords, Length: 261678, dtype: object
# This vectorizer breaks text into single words (unigrams only — see
# ngram_range=(1, 1) below; the original comment's mention of bi-grams was
# inaccurate) and then calculates the TF-IDF representation.
vectorizer = TfidfVectorizer(ngram_range=(1,1))
# 'fit' builds the vocabulary from all the reviews, while the 'transform'
# step turns each individual text into a row of TF-IDF weights.
# NOTE(review): 'keywords' appears to hold *string* representations of token
# lists (e.g. "['good', 'place', ...]"), so the default tokenizer also sees
# the bracket/quote punctuation — confirm this is intended.
vectors = vectorizer.fit_transform(keywords)
The TfidfVectorizer is a tool for preprocessing and transforming text data. It converts a collection of text documents to a matrix of numerical features by implementing the following steps:
Tokenization: The vectorizer first breaks down the text into individual words or tokens; in our case we have previously split the text into individual keywords.
N-grams: Next, the vectorizer constructs a vocabulary of all the unique n-grams (a contiguous sequence of n items from a given sample of text or speech) in the text data. As we have individual keywords or unigrams, which are, single words, we need to pass to the model a ngram range of (1,1). In the event that we had an entire text, we would have to pass it a range of (1,3).
The fit_transform method fits the vectorizer to the text data and then transforms it into a numerical feature matrix where keywords have a different weight based on the importance of the keyword.
vectors
<261678x24548 sparse matrix of type '<class 'numpy.float64'>' with 1335280 stored elements in Compressed Sparse Row format>
We split into train and test with a proportion of 80-20.
# 80/20 split of the TF-IDF matrix against the 1-5 star target.
# NOTE(review): with shuffle=False the random_state argument has no effect,
# and since the rows are grouped by business, an unshuffled split can place
# whole businesses only in train or only in test — confirm this is intended
# (the second model's split below uses shuffling + stratification instead).
X_train, X_test, Y_train, Y_test = train_test_split(vectors, stars_ratings[1], test_size=0.20, random_state=seed,
                                                   shuffle =False,)
We have identified, based on previous research that we have conducted, that the following models perform quite well in this scenario.
classifiers = [
LogisticRegression(),
MultinomialNB(),
RandomForestClassifier(),
LinearSVC()
]
get_models_score(classifiers, X_train, Y_train)
LogisticRegression() model score: 0.485 Accuracy: 0.48492433506572913 MultinomialNB() model score: 0.463 Accuracy: 0.4634095077957811 RandomForestClassifier() model score: 0.465 Accuracy: 0.4652247019260165 LinearSVC() model score: 0.473 Accuracy: 0.47330709263222254
The logistic regression model gives us the best results; let's look at its confusion matrix.
# Fit the selected logistic-regression model and report held-out accuracy.
# The original cell had a dead `LR = LogisticRegression()` assignment that
# was immediately overwritten — removed. max_iter is raised from the default
# (100) because lbfgs previously hit its iteration limit on this data (see
# the ConvergenceWarning in the recorded output).
LR = LogisticRegression(penalty='l2', C=1.0, random_state=seed, n_jobs=2, max_iter=1000)
LR.fit(X_train, Y_train)
y_pred = LR.predict(X_test)
print("model score: %.3f" % LR.score(X_test, Y_test))
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))
<frozen importlib._bootstrap>:228: RuntimeWarning: scipy._lib.messagestream.MessageStream size changed, may indicate binary incompatibility. Expected 56 from C header, got 64 from PyObject
model score: 0.485 Accuracy: 0.48492433506572913
/Users/ignaciogonzalez/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:444: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
get_model_report(Y_test, y_pred)
Classification Report
precision recall f1-score support
1 0.56 0.52 0.54 6114
2 0.31 0.12 0.17 4795
3 0.35 0.14 0.20 7198
4 0.39 0.45 0.42 14270
5 0.56 0.71 0.62 19959
accuracy 0.48 52336
macro avg 0.43 0.39 0.39 52336
weighted avg 0.46 0.48 0.46 52336
As we have been anticipating, the results could be much better. However, we consider that the results obtained are not entirely bad since as the objective variable we have 5 categories or ratings. Therefore, we consider that it can be way more difficult to predict between 5 categories instead of 2.
get_confusion_matrix(LR, X_test, Y_test)
Confusion matrix, without normalization [[ 3159 448 327 829 1351] [ 1031 552 594 1318 1300] [ 542 374 1025 3080 2177] [ 412 198 682 6426 6552] [ 462 189 338 4753 14217]] Normalized confusion matrix [[0.51668302 0.07327445 0.05348381 0.13559045 0.22096827] [0.21501564 0.11511992 0.12387904 0.27486966 0.27111575] [0.07529869 0.05195888 0.14240067 0.42789664 0.30244512] [0.02887176 0.01387526 0.04779257 0.45031535 0.45914506] [0.02314745 0.00946941 0.01693472 0.23813818 0.71231024]]
As a conclusion, we can deduce that the fact that the objective variable is so unbalanced, it negatively affects the results of the model. This is because, although the model in general does not accurately predict all categories (1-5) we found that it misses a lot in categories 1-3, compared to 4-5. This is due, again, to the fact that it is quite unbalanced.
Therefore, we have decided to create another model in which we pass the ratings variable as the objective variable, but in this case we will pass two categories:
0 for ratings from 1-3 as if it were a bad experience or review
1 for ratings 4 and 5 as if it were a good experience.
columns_to_keep = ['stars_x', 'keywords']
# Take an explicit copy so later in-place edits to this frame (the stars_x
# binarization below) cannot trigger pandas' SettingWithCopyWarning or be
# interpreted as chained assignment against the original DataFrame.
df_second_model = nightlife_businesses_PA[columns_to_keep].copy()
df_second_model
| stars_x | keywords | |
|---|---|---|
| 0 | 5 | ['Tremendous', 'service', 'Big', 'shout', 'Dou... |
| 1 | 3 | ['good', 'place', 'lofty', 'prices', 'proporti... |
| 2 | 4 | ['bar', 'area', 'upscale', 'cities', 'restaura... |
| 3 | 5 | ['prime', 'rib', 'steak', 'joints', 'experience'] |
| 4 | 5 | ['name', 'Best', 'Prime', 'Rib', 'town', 'Serv... |
| ... | ... | ... |
| 261673 | 2 | ['family', 'special', 'occasion', 'help', 'dis... |
| 261674 | 5 | ['open', 'spot'] |
| 261675 | 5 | ['beautiful', 'birthday', 'Lark', 'staff', 'de... |
| 261676 | 5 | ['good', 'Lark', 'anyone', 'get'] |
| 261677 | 5 | ['food', 'perfect', 'rare', 'restaurant', 'goo... |
261678 rows × 2 columns
# Total number of missing cells across the two retained columns (expect 0).
nan_count = df_second_model.isna().values.sum()
print(nan_count)
0
As we can see, the dataset doesn't include any nulls, which is why we decided earlier not to treat them until we had the use case and the DataFrame prepared.
# Binarize the target: ratings 1-3 -> 0 (bad experience), 4-5 -> 1 (good).
# Vectorized equivalent of df.apply(transform_ratings, axis=1), which calls
# a Python function once per row and is far slower on ~260k rows.
df_second_model['stars_x'] = (df_second_model['stars_x'] > 3).astype(int)
We have created the transform_ratings function to replace the categories 1-5 with 0 and 1, based on the previous assumption.
df_second_model
| stars_x | keywords | |
|---|---|---|
| 0 | 1 | ['Tremendous', 'service', 'Big', 'shout', 'Dou... |
| 1 | 0 | ['good', 'place', 'lofty', 'prices', 'proporti... |
| 2 | 1 | ['bar', 'area', 'upscale', 'cities', 'restaura... |
| 3 | 1 | ['prime', 'rib', 'steak', 'joints', 'experience'] |
| 4 | 1 | ['name', 'Best', 'Prime', 'Rib', 'town', 'Serv... |
| ... | ... | ... |
| 261673 | 0 | ['family', 'special', 'occasion', 'help', 'dis... |
| 261674 | 1 | ['open', 'spot'] |
| 261675 | 1 | ['beautiful', 'birthday', 'Lark', 'staff', 'de... |
| 261676 | 1 | ['good', 'Lark', 'anyone', 'get'] |
| 261677 | 1 | ['food', 'perfect', 'rare', 'restaurant', 'goo... |
261678 rows × 2 columns
df_second_model["stars_x"].value_counts()
1 172155 0 89523 Name: stars_x, dtype: int64
We still have a quite unbalanced variable. Still we will test the model and observe the results. Therefore we create the two variables storing the ratings and keywords that will be passed to the models.
# NOTE(review): same pattern as before — iterating a DataFrame yields its
# column names, so this built one identical copy of the target Series per
# column (the output below shows a 2-element list, one per column). Keep the
# list shape (later cells index stars_ratings[1]) without re-extracting.
_target = df_second_model['stars_x']
stars_ratings = [_target for _ in df_second_model.columns]
stars_ratings
[0 1
1 0
2 1
3 1
4 1
..
261673 0
261674 1
261675 1
261676 1
261677 1
Name: stars_x, Length: 261678, dtype: int64,
0 1
1 0
2 1
3 1
4 1
..
261673 0
261674 1
261675 1
261676 1
261677 1
Name: stars_x, Length: 261678, dtype: int64]
keywords = df_second_model['keywords']
keywords
0 ['Tremendous', 'service', 'Big', 'shout', 'Dou...
1 ['good', 'place', 'lofty', 'prices', 'proporti...
2 ['bar', 'area', 'upscale', 'cities', 'restaura...
3 ['prime', 'rib', 'steak', 'joints', 'experience']
4 ['name', 'Best', 'Prime', 'Rib', 'town', 'Serv...
...
261673 ['family', 'special', 'occasion', 'help', 'dis...
261674 ['open', 'spot']
261675 ['beautiful', 'birthday', 'Lark', 'staff', 'de...
261676 ['good', 'Lark', 'anyone', 'get']
261677 ['food', 'perfect', 'rare', 'restaurant', 'goo...
Name: keywords, Length: 261678, dtype: object
# This vectorizer breaks text into single words and uni-grams
# and then calculates the TF-IDF representation
vectorizer = TfidfVectorizer(ngram_range=(1,1))
# the 'fit' builds up the vocabulary from all the reviews
# while the 'transform' step turns each indivdual text into
# a matrix of numbers.
vectors = vectorizer.fit_transform(keywords)
# 80/20 split, stratified on the binary target so both classes keep their
# base rates in train and test despite the 1:2 class imbalance.
X_train, X_test, Y_train, Y_test = train_test_split(vectors, stars_ratings[1], test_size=0.20, random_state=seed,
                                                   stratify= stars_ratings[1])
classifiers = [
LogisticRegression(),
MultinomialNB(),
RandomForestClassifier(),
LinearSVC()
]
get_models_score(classifiers, X_train, Y_train)
LogisticRegression() model score: 0.773 Accuracy: 0.7730243044940385 MultinomialNB() model score: 0.755 Accuracy: 0.7547386120452461 RandomForestClassifier() model score: 0.757 Accuracy: 0.7571079180678691 LinearSVC() model score: 0.765 Accuracy: 0.7648845918679303
We have obtained much better results. Obviously the model behaves much better with fewer categories than with more.
We choose logistic regression to observe the confusion matrix and the curves.
# Fit the binary logistic-regression model and report held-out accuracy.
# The original cell's dead `LR = LogisticRegression()` assignment (overwritten
# on the next line) is removed. max_iter raised from the default (100) since
# lbfgs previously failed to converge here (ConvergenceWarning in the output).
LR = LogisticRegression(penalty='l2', C=1.0, random_state=seed, n_jobs=2, max_iter=1000)
LR.fit(X_train, Y_train)
y_pred = LR.predict(X_test)
print("model score: %.3f" % LR.score(X_test, Y_test))
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))
<frozen importlib._bootstrap>:228: RuntimeWarning: scipy._lib.messagestream.MessageStream size changed, may indicate binary incompatibility. Expected 56 from C header, got 64 from PyObject
model score: 0.774 Accuracy: 0.7735402017731581
/Users/ignaciogonzalez/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:444: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
get_model_report(Y_test, y_pred)
Classification Report
precision recall f1-score support
0 0.73 0.53 0.62 17905
1 0.79 0.90 0.84 34431
accuracy 0.77 52336
macro avg 0.76 0.72 0.73 52336
weighted avg 0.77 0.77 0.76 52336
In general we get better results compared to the previous model. Let´s see the confusion matrix
get_confusion_matrix(LR, X_test, Y_test)
Confusion matrix, without normalization [[ 9493 8412] [ 3440 30991]] Normalized confusion matrix [[0.5301871 0.4698129 ] [0.09990996 0.90009004]]
The results are not the best, however they are much better compared to the previous model.
We have a high probability of predicting good reviews based on keywords and a 50% probability of predicting a bad review. This is because the variable is unbalanced.
In the worst case scenario, which would be predicted as a good review but is actually bad, we still have a 50% chance. The model could be better however we are happy with the results.
The conclusion we can draw is that the imbalance of the dataset negatively affects the prediction of category 0
# Column 1 of predict_proba is P(class == 1); use it as the ROC score input.
prob_predictions = LR.predict_proba(X_test)
yhat = prob_predictions[:, 1]
get_roc_curves(Y_test, yhat)
Best Threshold=0.660704, G-Mean=0.743
As we can see in the graph, the ROC curve bends toward the top-left corner, which indicates good performance and shows how well the classifier distinguishes between positive and negative cases; a curve hugging the diagonal toward the bottom-right would instead indicate a poor model with a high false-positive rate. The second graph indicates the optimal threshold for this model, approximately 0.66.
A G-mean of 0.743 is generally considered to be a good score, indicating that the classifier has a good balance between TPR and TNR.
precision, recall, thresholds = precision_recall_curve(Y_test, yhat)
# NOTE(review): get_precission_recall_curve is defined elsewhere in the
# notebook; given its signature it appears to read the precision/recall/
# thresholds computed above via globals — confirm, and consider passing
# them in explicitly instead.
get_precission_recall_curve(Y_test)
Best Threshold=0.435148, F-Score=0.843
In this case we got a fairly high AUC-PR score, which is the area under the precision-recall curve. This curve, like the ROC curve, gives us the precision achieved at each threshold, and its optimal threshold provides another reference point for tuning the classifier.
score = f1_score(Y_test, y_pred)
print('F-Score: %.5f' % score)
F-Score: 0.83948
The F1 score is a measure of a test's accuracy. It is calculated as the harmonic mean of the precision and recall of the test, with a higher score indicating a better balance between precision and recall.
An F1 score of 0.84 is generally considered to be a good score. It indicates that the classifier has a good balance between precision and recall.
skplt.metrics.plot_cumulative_gain(Y_test, prob_predictions)
plt.show()
According to the cumulative curve, if we approach 20% of our transaction base (x-axis), we will get over 30% of the ratings of category 1 predicted and nearly 50% of the ratings of the category 0. With 40% of the sample we will get 55% of the ratings of category 1 predicted and nearly 80% of category 0.
skplt.metrics.plot_lift_curve(Y_test, prob_predictions)
plt.show()
This curve tells us how much better our model predicts than random guessing. For example, using the top 20% of our predictions, our model is about 1.5 and 2.25 times better (for category 1 and 0 respectively) than randomly selecting 20% from our transaction pool.
# F-beta score weighing recall over precision; `beta` is presumably set to 2
# in an earlier cell (per the discussion below) — TODO confirm its value.
fbeta_score(Y_test, y_pred, beta=beta)
0.8748242786249415
The F2 score is a measure of a test's accuracy that weighs recall higher than precision. 0,87 is a good score, with a good balance between precision and recall giving more weight to recall.
We set the beta value to 2 to make recall more important than precision. This focuses on minimizing false negatives rather than false positives, which is what interests us. In this case, as we are correctly classifying samples in the majority of cases for this model, we obtain higher precision and recall scores, which yields a higher F-measure value.
Although we have obtained good results with the model, we have observed that the model heavily penalizes bad experiences or reviews. In other words, since it is quite unbalanced, the model has trouble predicting category 0.
Therefore, we are going to use an oversampling technique to try to get better results.
# Balance the training classes by randomly duplicating minority-class rows.
# Only the training split is resampled; the test split is left untouched so
# the evaluation stays honest.
oversampler = RandomOverSampler(sampling_strategy='minority')
X_train, X_test, Y_train, Y_test = train_test_split(
    vectors,
    stars_ratings[1],
    test_size=0.20,
    random_state=seed,
    stratify=stars_ratings[1],
)
X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, Y_train)
y_train_oversampled
0 1
1 0
2 1
3 1
4 0
..
275443 0
275444 0
275445 0
275446 0
275447 0
Name: stars_x, Length: 275448, dtype: int64
classifiers = [
LogisticRegression(),
MultinomialNB(),
RandomForestClassifier(),
LinearSVC()
]
get_models_score(classifiers, X_train_oversampled, y_train_oversampled)
LogisticRegression() model score: 0.734 Accuracy: 0.7344084377866096 MultinomialNB() model score: 0.733 Accuracy: 0.73318557016203 RandomForestClassifier() model score: 0.745 Accuracy: 0.7446690614490982 LinearSVC() model score: 0.727 Accuracy: 0.727128553959034
As far as we can see, we have not obtained better results in terms of model score and accuracy, Let´s see the confusion matrix
# Refit a random forest on the oversampled training data and evaluate on the
# untouched (still-imbalanced) test split.
clf = RandomForestClassifier()
clf.fit(X_train_oversampled, y_train_oversampled)
y_pred = clf.predict(X_test)
print("model score: %.3f" % clf.score(X_test, Y_test))
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))
model score: 0.748 Accuracy: 0.7481848058697645
get_model_report(Y_test, y_pred)
Classification Report
precision recall f1-score support
0 0.64 0.60 0.62 17905
1 0.80 0.83 0.81 34431
accuracy 0.75 52336
macro avg 0.72 0.71 0.72 52336
weighted avg 0.74 0.75 0.75 52336
The overall results in precision recall and f1 seem to be a bit worse, however let´s see the confusion matrix as we consider it is one of the most important metrics to observe.
get_confusion_matrix(clf, X_test, Y_test)
Confusion matrix, without normalization [[10672 7233] [ 5946 28485]] Normalized confusion matrix [[0.59603463 0.40396537] [0.17269321 0.82730679]]
As we can see, the oversampled model seems to predict category 1 worse but category 0 a little better. However, in the case that we had to choose one, taking into account the worst case scenario (which would be predicting a review as good but it is actually bad) you would be getting better results. Therefore, in this aspect the model behaves better since in the unbalanced model we would be failing in 8412 cases compared to 7233. The model would then be 7% better predicting the worst scenario. It is for this reason that if we had to choose a model we would consider staying with this one.
prob_predictions = clf.predict_proba(X_test)
yhat = prob_predictions[:, 1]
get_roc_curves(Y_test, yhat)
Best Threshold=0.610588, G-Mean=0.722
We got almost the same results compared to the previous model. We concluded that both models are very closed however the second model tends to perform better in the worst scenario.
precision, recall, thresholds = precision_recall_curve(Y_test, yhat)
get_precission_recall_curve(Y_test)
Best Threshold=0.350190, F-Score=0.830
score = f1_score(Y_test, y_pred)
print('F-Score: %.5f' % score)
F-Score: 0.81213
skplt.metrics.plot_cumulative_gain(Y_test, prob_predictions)
plt.show()
According to the cumulative gains curve, the balanced model is a bit worse: with 20% of the sample we explain almost 30% of category 1 and 43% of category 0, compared to the nearly 50% of category 0 that we were explaining in the previous model.
skplt.metrics.plot_lift_curve(Y_test, prob_predictions)
plt.show()
We can also see reflected in this curve what we have been commenting on the previous curve analysis. With a 20% of the sample the balanced model is a bit worse predicting than randomly guessing.
fbeta_score(Y_test, y_pred, beta=beta)
0.821167883211679
According to the f1 and f2 scores the model is also worse. Therefore we have decided, even though that the model predicts a bit better in the worst scenario, to chose the unbalanced model for the interpretability section.
X_test
<52336x24548 sparse matrix of type '<class 'numpy.float64'>' with 267462 stored elements in Compressed Sparse Row format>
# Map column indices of the TF-IDF matrix back to the vocabulary terms.
feature_names = vectorizer.get_feature_names_out()
# Explain the (unbalanced) logistic-regression model, with the training
# matrix as the background distribution.
explainer = shap.Explainer(LR, X_train)
shap_values = explainer.shap_values(X_test)
# NOTE(review): densifying a 52,336 x 24,548 sparse matrix allocates on the
# order of 10 GB — consider sampling rows before todense() if memory is tight.
colour_test = pd.DataFrame(X_test.todense())
shap.summary_plot(shap_values, colour_test, feature_names=feature_names)
The summary plot above shows the top 20 features based on their feature importance for the predictions. The SHAP value on the x-axis shows whether the feature effected a higher or lower prediction probability. Each dot represents a different test observation and the colour of the dot is how important that feature was for that particular prediction.
As expected, many strongly sentiment-laden words such as "great", "delicious", "amazing", "love", "excellent" and more are present in the top-20 feature set.
From this graph we can interpret that a high value in words like "great", "delicious", "excellent", for example leads us to categorize a review as a "good" experience, that is, to fall on the 1 category that would correspond to the review ratings with 4 or 5 stars.
On the other hand, words like "table", "minutes", "hour", "ok" leads us to categorize a review as a "bad" experience, that is, to fall on the zero category that would correspond to the review ratings with 1, 2 or 3 stars.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[5,:],
colour_test.iloc[5,:], feature_names=feature_names)
In the force plot above, we have an example of a review prediction where the output value (i.e. the prediction for this observation) was 4.22. The base value is the predicted value if we did not have any knowledge of the features (it is the mean prediction value). Higher scores lead the model to predict 1 and lower scores lead the model to predict 0. So this particular review was ultimately classified as category (1), because they were pushed higher by all the factors shown in red
The red colour means that each feature pushed the prediction probability higher, whereas blue would have pushed the probability lower. Here we can see how words such as “wonderful” contributed to a higher good review prediction probability. Besides, words such us "table" have a negative impact on the prediction.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[2760,:],
colour_test.iloc[2760,:], feature_names=feature_names)
In this case, we have an example of another review prediction where the output value was -1.98.
Here we can see how words such as “good” contributed to a higher good review prediction probability. Besides, words such us "bad", "service", "star" have had a bigger negative impact on the prediction pushing the prediction to a negative -1.98.